# System
from time import time
# Data
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
# Plotting
import matplotlib.pyplot as plt
import seaborn as sns
from yellowbrick.contrib.missing import MissingValuesBar
%matplotlib inline
# Modeling
from pycaret.classification import *
# Explainability
import shap
data = pd.read_csv("../../data/final_project.csv")
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 160000 entries, 0 to 159999
Data columns (total 51 columns):
 #   Column  Non-Null Count   Dtype
---  ------  --------------   -----
 0   x0      159974 non-null  float64
 1   x1      159975 non-null  float64
 2   x2      159962 non-null  float64
 3   x3      159963 non-null  float64
 4   x4      159974 non-null  float64
 5   x5      159963 non-null  float64
 6   x6      159974 non-null  float64
 7   x7      159973 non-null  float64
 8   x8      159979 non-null  float64
 9   x9      159970 non-null  float64
 10  x10     159957 non-null  float64
 11  x11     159970 non-null  float64
 12  x12     159964 non-null  float64
 13  x13     159969 non-null  float64
 14  x14     159966 non-null  float64
 15  x15     159965 non-null  float64
 16  x16     159974 non-null  float64
 17  x17     159973 non-null  float64
 18  x18     159960 non-null  float64
 19  x19     159965 non-null  float64
 20  x20     159962 non-null  float64
 21  x21     159971 non-null  float64
 22  x22     159973 non-null  float64
 23  x23     159953 non-null  float64
 24  x24     159972 non-null  object
 25  x25     159978 non-null  float64
 26  x26     159964 non-null  float64
 27  x27     159970 non-null  float64
 28  x28     159965 non-null  float64
 29  x29     159970 non-null  object
 30  x30     159970 non-null  object
 31  x31     159961 non-null  float64
 32  x32     159969 non-null  object
 33  x33     159959 non-null  float64
 34  x34     159959 non-null  float64
 35  x35     159970 non-null  float64
 36  x36     159973 non-null  float64
 37  x37     159977 non-null  object
 38  x38     159969 non-null  float64
 39  x39     159977 non-null  float64
 40  x40     159964 non-null  float64
 41  x41     159960 non-null  float64
 42  x42     159974 non-null  float64
 43  x43     159963 non-null  float64
 44  x44     159960 non-null  float64
 45  x45     159971 non-null  float64
 46  x46     159969 non-null  float64
 47  x47     159963 non-null  float64
 48  x48     159968 non-null  float64
 49  x49     159968 non-null  float64
 50  y       160000 non-null  int64
dtypes: float64(45), int64(1), object(5)
memory usage: 62.3+ MB
features = data.columns[pd.Series(data.columns).str.startswith('x')].to_list()
target = data.columns[pd.Series(data.columns).str.startswith('y')].to_list()
print(f"Features: Length: {len(features)}, Values: {features} ")
print(f"Target: {target}")
Features: Length: 50, Values: ['x0', 'x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x10', 'x11', 'x12', 'x13', 'x14', 'x15', 'x16', 'x17', 'x18', 'x19', 'x20', 'x21', 'x22', 'x23', 'x24', 'x25', 'x26', 'x27', 'x28', 'x29', 'x30', 'x31', 'x32', 'x33', 'x34', 'x35', 'x36', 'x37', 'x38', 'x39', 'x40', 'x41', 'x42', 'x43', 'x44', 'x45', 'x46', 'x47', 'x48', 'x49']
Target: ['y']
for feature in data[features].select_dtypes('object').columns:
    print(f"\nFeature: {feature}")
    print(data[feature].value_counts())
Feature: x24
asia 138965
euorpe 16538
america 4469
Name: x24, dtype: int64
Feature: x29
July 45569
Jun 41329
Aug 29406
May 21939
sept. 10819
Apr 6761
Oct 2407
Mar 1231
Nov 337
Feb 140
Dev 23
January 9
Name: x29, dtype: int64
Feature: x30
wednesday 101535
thurday 29429
tuesday 27954
friday 564
monday 488
Name: x30, dtype: int64
Feature: x32
0.01% 40767
-0.01% 34094
0.0% 33923
-0.0% 30492
-0.02% 9924
0.02% 7987
-0.03% 1727
0.03% 855
-0.04% 138
0.04% 55
-0.05% 6
0.05% 1
Name: x32, dtype: int64
Feature: x37
$237.4 6
$72.42 6
$-415.46 6
$-311.26 6
$341.26 6
..
$-600.93 1
$1173.98 1
$605.4 1
$62.79 1
$-780.9 1
Name: x37, Length: 129198, dtype: int64
# Strip the trailing "%" from x32 and the leading "$" from x37, then cast both to float
data['x32'].replace(to_replace=r"%$", value="", regex=True, inplace=True)
data['x32'] = data['x32'].astype(float)
data['x37'].replace(to_replace=r"^\$", value="", regex=True, inplace=True)
data['x37'] = data['x37'].astype(float)
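A quick sanity check (a minimal sketch using the columns above) confirms both cleaned columns are now numeric. Note that x32 keeps its percent-scale magnitude ("0.01%" becomes 0.01, not 0.0001):
# Sanity check (sketch): both cleaned columns should now be float64
print(data[['x32', 'x37']].dtypes)
print(data[['x32', 'x37']].describe())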
features_categorical = data[features].select_dtypes('object').columns
features_numerical = [feature for feature in features if feature not in features_categorical]
print(f"Features Categorical: Length: {len(features_categorical)}, Values: {features_categorical} ")
print(f"Features Numeric: Length: {len(features_numerical)}, Values: {features_numerical} ")
Features Categorical: Length: 3, Values: Index(['x24', 'x29', 'x30'], dtype='object')
Features Numeric: Length: 47, Values: ['x0', 'x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x10', 'x11', 'x12', 'x13', 'x14', 'x15', 'x16', 'x17', 'x18', 'x19', 'x20', 'x21', 'x22', 'x23', 'x25', 'x26', 'x27', 'x28', 'x31', 'x32', 'x33', 'x34', 'x35', 'x36', 'x37', 'x38', 'x39', 'x40', 'x41', 'x42', 'x43', 'x44', 'x45', 'x46', 'x47', 'x48', 'x49']
# # Instantiate the visualizer
# plt.figure(figsize=(15,10))
# visualizer = MissingValuesBar(features=features)
# visualizer.fit(X=data[features], y=data[target].values) # Supply the targets via y
# _ = visualizer.show() # Finalize and render the figure
# Instantiate the visualizer
plt.figure(figsize=(15,10))
visualizer = MissingValuesBar(features=features_numerical)
visualizer.fit(X=data[features_numerical], y=data[target].values) # Supply the targets via y
_ = visualizer.show() # Finalize and render the figure
data.dropna(inplace=True)
data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 158392 entries, 0 to 159999
Data columns (total 51 columns):
 #   Column  Non-Null Count   Dtype
---  ------  --------------   -----
 0   x0      158392 non-null  float64
 1   x1      158392 non-null  float64
 2   x2      158392 non-null  float64
 3   x3      158392 non-null  float64
 4   x4      158392 non-null  float64
 5   x5      158392 non-null  float64
 6   x6      158392 non-null  float64
 7   x7      158392 non-null  float64
 8   x8      158392 non-null  float64
 9   x9      158392 non-null  float64
 10  x10     158392 non-null  float64
 11  x11     158392 non-null  float64
 12  x12     158392 non-null  float64
 13  x13     158392 non-null  float64
 14  x14     158392 non-null  float64
 15  x15     158392 non-null  float64
 16  x16     158392 non-null  float64
 17  x17     158392 non-null  float64
 18  x18     158392 non-null  float64
 19  x19     158392 non-null  float64
 20  x20     158392 non-null  float64
 21  x21     158392 non-null  float64
 22  x22     158392 non-null  float64
 23  x23     158392 non-null  float64
 24  x24     158392 non-null  object
 25  x25     158392 non-null  float64
 26  x26     158392 non-null  float64
 27  x27     158392 non-null  float64
 28  x28     158392 non-null  float64
 29  x29     158392 non-null  object
 30  x30     158392 non-null  object
 31  x31     158392 non-null  float64
 32  x32     158392 non-null  float64
 33  x33     158392 non-null  float64
 34  x34     158392 non-null  float64
 35  x35     158392 non-null  float64
 36  x36     158392 non-null  float64
 37  x37     158392 non-null  float64
 38  x38     158392 non-null  float64
 39  x39     158392 non-null  float64
 40  x40     158392 non-null  float64
 41  x41     158392 non-null  float64
 42  x42     158392 non-null  float64
 43  x43     158392 non-null  float64
 44  x44     158392 non-null  float64
 45  x45     158392 non-null  float64
 46  x46     158392 non-null  float64
 47  x47     158392 non-null  float64
 48  x48     158392 non-null  float64
 49  x49     158392 non-null  float64
 50  y       158392 non-null  int64
dtypes: float64(47), int64(1), object(3)
memory usage: 62.8+ MB
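Complete-case filtering removed 160,000 - 158,392 = 1,608 rows, roughly 1% of the data; a quick check (sketch):
# Roughly 1% of rows contained at least one missing value
n_dropped = 160000 - len(data)
print(f"Rows dropped: {n_dropped} ({n_dropped / 160000:.2%})")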
plt.figure(figsize=(20,10))
sns.heatmap(data.corr())
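The heatmap hints at a couple of near-duplicate features. A programmatic check (a minimal sketch over the numeric features defined above; the 0.95 cutoff is an arbitrary choice) lists the strongly correlated pairs and motivates the scatter plots below:
# List feature pairs with very high absolute correlation (sketch)
corr = data[features_numerical].corr().abs()
upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))  # keep upper triangle only
pairs = upper.stack()
print(pairs[pairs > 0.95].sort_values(ascending=False))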
sns.relplot(x='x2', y = 'x6', data=data)
sns.relplot(x='x38', y = 'x41', data=data)
data.drop(columns=['x2', 'x6'], inplace=True)
features = data.columns[pd.Series(data.columns).str.startswith('x')].to_list()
features_categorical = data[features].select_dtypes('object').columns
features_numerical = [feature for feature in features if feature not in features_categorical]
print(f"Features Categorical: Length: {len(features_categorical)}, Values: {features_categorical} ")
print(f"Features Numeric: Length: {len(features_numerical)}, Values: {features_numerical} ")
Features Categorical: Length: 3, Values: Index(['x24', 'x29', 'x30'], dtype='object')
Features Numeric: Length: 45, Values: ['x0', 'x1', 'x3', 'x4', 'x5', 'x7', 'x8', 'x9', 'x10', 'x11', 'x12', 'x13', 'x14', 'x15', 'x16', 'x17', 'x18', 'x19', 'x20', 'x21', 'x22', 'x23', 'x25', 'x26', 'x27', 'x28', 'x31', 'x32', 'x33', 'x34', 'x35', 'x36', 'x37', 'x38', 'x39', 'x40', 'x41', 'x42', 'x43', 'x44', 'x45', 'x46', 'x47', 'x48', 'x49']
The relationship to the target variable seems weak at best, but a correlation heatmap may not be the right way to visualize it. We should rank the features by absolute correlation and look at histograms of the top features faceted by the target.
np.abs(data.corr()['y']).sort_values(ascending=False)
y      1.000000
x20    0.241660
x23    0.237010
x49    0.222356
x40    0.191780
x42    0.124892
x41    0.122538
x38    0.122538
x12    0.106962
x37    0.054629
x46    0.040904
x7     0.014671
x8     0.006446
x32    0.005654
x34    0.005198
x16    0.004468
x17    0.004135
x27    0.003836
x10    0.003481
x1     0.003381
x48    0.003198
x45    0.003114
x21    0.003075
x18    0.002833
x9     0.002749
x39    0.002341
x35    0.002246
x26    0.001995
x13    0.001935
x3     0.001870
x25    0.001870
x19    0.001670
x15    0.001599
x0     0.001581
x33    0.001529
x31    0.001499
x36    0.001453
x4     0.001050
x5     0.000966
x11    0.000786
x14    0.000780
x28    0.000632
x47    0.000542
x22    0.000127
x44    0.000121
x43    0.000088
Name: y, dtype: float64
_ = sns.histplot(data=data, x="x20", hue="y")
_ = sns.histplot(data=data, x="x23", hue="y")
_ = sns.histplot(data=data, x="x49", hue="y")
# https://github.com/AutoViML/AutoViz
from autoviz.AutoViz_Class import AutoViz_Class
Imported AutoViz_Class version: 0.0.79. Call using:
from autoviz.AutoViz_Class import AutoViz_Class
AV = AutoViz_Class()
AV.AutoViz(filename, sep=',', depVar='', dfte=None, header=0, verbose=0,
lowess=False,chart_format='svg',max_rows_analyzed=150000,max_cols_analyzed=30)
Note: verbose=0 or 1 generates charts and displays them in your local Jupyter notebook.
verbose=2 saves plots in your local machine under AutoViz_Plots directory and does not display charts.
AV = AutoViz_Class()
# ?AV.AutoViz
# ?data.sample
data_subset = data.sample(frac=0.1, random_state=42)
filename = ""
dft = AV.AutoViz(
filename="",
sep=",",
depVar="y",
dfte=data_subset,
header=0,
verbose=0,
lowess=False,
chart_format="svg",
max_rows_analyzed=10000000,
max_cols_analyzed=100,
)
Shape of your Data Set: (15839, 49)
############## C L A S S I F Y I N G V A R I A B L E S ####################
Classifying variables in data set...
Number of Numeric Columns = 45
Number of Integer-Categorical Columns = 0
Number of String-Categorical Columns = 3
Number of Factor-Categorical Columns = 0
Number of String-Boolean Columns = 0
Number of Numeric-Boolean Columns = 0
Number of Discrete String Columns = 0
Number of NLP String Columns = 0
Number of Date Time Columns = 0
Number of ID Columns = 0
Number of Columns to Delete = 0
48 Predictors classified...
This does not include the Target column(s)
No variables removed since no ID or low-information variables found in data set
################### Binary-Class VISUALIZATION Started #####################
Total Number of Scatter Plots = 1035
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
~\.conda\envs\ds7337_cs3\lib\site-packages\IPython\core\formatters.py in __call__(self, obj)
... (IPython / matplotlib rendering stack elided) ...
~\.conda\envs\ds7337_cs3\lib\site-packages\matplotlib\backends\backend_agg.py in __init__(self, width, height, dpi)
---> 96     self._renderer = _RendererAgg(int(width), int(height), dpi)

ValueError: Image size of 1080x142560 pixels is too large. It must be less than 2^16 in each direction.
<Figure size 1080x142560 with 990 Axes>
Time to run AutoViz (in seconds) = 211.538
###################### VISUALIZATION Completed ########################
X = data[features]
y = data[target]
print(X.shape, y.shape)
(158392, 48) (158392, 1)
train = data.sample(frac=0.80, random_state=42)
test = data.drop(train.index)
train.reset_index(inplace=True, drop=True)
test.reset_index(inplace=True, drop=True)
print('Data for Modeling: ' + str(train.shape))
print('Unseen Data For Predictions: ' + str(test.shape))
Data for Modeling: (126714, 49)
Unseen Data For Predictions: (31678, 49)
exp_01 = setup(
data=train,
target='y',
train_size=0.8,
data_split_stratify=True,
fold=3,
session_id=42,
log_experiment=True,
use_gpu=True
)
# Polynomial Features
# Feature Selection?
# combine_rare_levels
# pca
|   | Description | Value |
|---|---|---|
| 0 | session_id | 42 |
| 1 | Target | y |
| 2 | Target Type | Binary |
| 3 | Label Encoded | 0: 0, 1: 1 |
| 4 | Original Data | (126714, 49) |
| 5 | Missing Values | False |
| 6 | Numeric Features | 45 |
| 7 | Categorical Features | 3 |
| 8 | Ordinal Features | False |
| 9 | High Cardinality Features | False |
| 10 | High Cardinality Method | None |
| 11 | Transformed Train Set | (101371, 64) |
| 12 | Transformed Test Set | (25343, 64) |
| 13 | Shuffle Train-Test | True |
| 14 | Stratify Train-Test | True |
| 15 | Fold Generator | StratifiedKFold |
| 16 | Fold Number | 3 |
| 17 | CPU Jobs | -1 |
| 18 | Use GPU | True |
| 19 | Log Experiment | True |
| 20 | Experiment Name | clf-default-name |
| 21 | USI | 1249 |
| 22 | Imputation Type | simple |
| 23 | Iterative Imputation Iteration | None |
| 24 | Numeric Imputer | mean |
| 25 | Iterative Imputation Numeric Model | None |
| 26 | Categorical Imputer | constant |
| 27 | Iterative Imputation Categorical Model | None |
| 28 | Unknown Categoricals Handling | least_frequent |
| 29 | Normalize | False |
| 30 | Normalize Method | None |
| 31 | Transformation | False |
| 32 | Transformation Method | None |
| 33 | PCA | False |
| 34 | PCA Method | None |
| 35 | PCA Components | None |
| 36 | Ignore Low Variance | False |
| 37 | Combine Rare Levels | False |
| 38 | Rare Level Threshold | None |
| 39 | Numeric Binning | False |
| 40 | Remove Outliers | False |
| 41 | Outliers Threshold | None |
| 42 | Remove Multicollinearity | False |
| 43 | Multicollinearity Threshold | None |
| 44 | Clustering | False |
| 45 | Clustering Iteration | None |
| 46 | Polynomial Features | False |
| 47 | Polynomial Degree | None |
| 48 | Trignometry Features | False |
| 49 | Polynomial Threshold | None |
| 50 | Group Features | False |
| 51 | Feature Selection | False |
| 52 | Features Selection Threshold | None |
| 53 | Feature Interaction | False |
| 54 | Feature Ratio | False |
| 55 | Interaction Threshold | None |
| 56 | Fix Imbalance | False |
| 57 | Fix Imbalance Method | SMOTE |
# check all metrics used for model evaluation
get_metrics()
| ID | Name | Display Name | Score Function | Scorer | Target | Args | Greater is Better | Multiclass | Custom |
|---|---|---|---|---|---|---|---|---|---|
| acc | Accuracy | Accuracy | <function accuracy_score at 0x00000140F7597AF8> | accuracy | pred | {} | True | True | False |
| auc | AUC | AUC | <function roc_auc_score at 0x00000140F7585E58> | make_scorer(roc_auc_score, needs_proba=True, e... | pred_proba | {'average': 'weighted', 'multi_class': 'ovr'} | True | True | False |
| recall | Recall | Recall | <function binary_multiclass_score_func.<locals... | make_scorer(wrapper, average=macro) | pred | {'average': 'macro'} | True | True | False |
| precision | Precision | Prec. | <function binary_multiclass_score_func.<locals... | make_scorer(wrapper, average=weighted) | pred | {'average': 'weighted'} | True | True | False |
| f1 | F1 | F1 | <function binary_multiclass_score_func.<locals... | make_scorer(wrapper, average=weighted) | pred | {'average': 'weighted'} | True | True | False |
| kappa | Kappa | Kappa | <function cohen_kappa_score at 0x00000140F759C... | make_scorer(cohen_kappa_score) | pred | {} | True | True | False |
| mcc | MCC | MCC | <function matthews_corrcoef at 0x00000140F759C... | make_scorer(matthews_corrcoef) | pred | {} | True | True | False |
def single_instance_metric(row):
    if row['y_test'] == 0 and row['y_pred'] == 1:    # False Positive
        return 10
    elif row['y_test'] == 1 and row['y_pred'] == 0:  # False Negative
        return 500
    else:                                            # Correct Predictions
        return 0

def fp10_fn500_func(y_test, y_pred):
    df = pd.DataFrame({'y_test': y_test, 'y_pred': y_pred})
    df['metric'] = df.apply(single_instance_metric, axis=1)
    return np.mean(df['metric'].values)
y_true = [1, 0, 1]
y_pred = [0, 0, 1]
fp10_fn500_func(y_true, y_pred)
166.66666666666666
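For this toy example there are no false positives and one false negative among three predictions, so the cost is (10*0 + 500*1)/3 ≈ 166.67. The row-wise apply above is easy to read but slow on 100k+ rows; a vectorized equivalent (a sketch with the same semantics) would be:
def fp10_fn500_vectorized(y_test, y_pred):
    y_test, y_pred = np.asarray(y_test), np.asarray(y_pred)
    fp = np.sum((y_test == 0) & (y_pred == 1))  # false positives cost 10
    fn = np.sum((y_test == 1) & (y_pred == 0))  # false negatives cost 500
    return (10 * fp + 500 * fn) / len(y_test)

assert np.isclose(fp10_fn500_vectorized(y_true, y_pred), fp10_fn500_func(y_true, y_pred))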
add_metric(
    id='fp10_fn500',
    name='fp10_fn500',
    score_func=fp10_fn500_func,
    target='pred',
    greater_is_better=False)
Name fp10_fn500
Display Name fp10_fn500
Score Function <function fp10_fn500_func at 0x00000140C79DE3A8>
Scorer make_scorer(fp10_fn500_func, greater_is_better...
Target pred
Args {}
Greater is Better False
Multiclass True
Custom True
Name: fp10_fn500, dtype: object
# remove_metric('fp10_fn500')
get_metrics()
| ID | Name | Display Name | Score Function | Scorer | Target | Args | Greater is Better | Multiclass | Custom |
|---|---|---|---|---|---|---|---|---|---|
| acc | Accuracy | Accuracy | <function accuracy_score at 0x00000140F7597AF8> | accuracy | pred | {} | True | True | False |
| auc | AUC | AUC | <function roc_auc_score at 0x00000140F7585E58> | make_scorer(roc_auc_score, needs_proba=True, e... | pred_proba | {'average': 'weighted', 'multi_class': 'ovr'} | True | True | False |
| recall | Recall | Recall | <function binary_multiclass_score_func.<locals... | make_scorer(wrapper, average=macro) | pred | {'average': 'macro'} | True | True | False |
| precision | Precision | Prec. | <function binary_multiclass_score_func.<locals... | make_scorer(wrapper, average=weighted) | pred | {'average': 'weighted'} | True | True | False |
| f1 | F1 | F1 | <function binary_multiclass_score_func.<locals... | make_scorer(wrapper, average=weighted) | pred | {'average': 'weighted'} | True | True | False |
| kappa | Kappa | Kappa | <function cohen_kappa_score at 0x00000140F759C... | make_scorer(cohen_kappa_score) | pred | {} | True | True | False |
| mcc | MCC | MCC | <function matthews_corrcoef at 0x00000140F759C... | make_scorer(matthews_corrcoef) | pred | {} | True | True | False |
| fp10_fn500 | fp10_fn500 | fp10_fn500 | <function fp10_fn500_func at 0x00000140C79DE3A8> | make_scorer(fp10_fn500_func, greater_is_better... | pred | {} | False | True | True |
# ?compare_models
# models()
start = time()
best_model = compare_models(
    sort='fp10_fn500',
    exclude=['gbc', 'ada', 'catboost'],  # Slow model(s)
    turbo=True  # Don't run slow models
)
end = time()
|   | Model | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | fp10_fn500 | TT (Sec) |
|---|---|---|---|---|---|---|---|---|---|---|
| xgboost | Extreme Gradient Boosting | 0.9213 | 0.9740 | 0.8887 | 0.9130 | 0.9007 | 0.8356 | 0.8358 | 22.6837 | 5.5067 |
| lightgbm | Light Gradient Boosting Machine | 0.9064 | 0.9660 | 0.8595 | 0.9028 | 0.8806 | 0.8037 | 0.8044 | 28.5896 | 2.7133 |
| rf | Random Forest Classifier | 0.9114 | 0.9687 | 0.8483 | 0.9249 | 0.8850 | 0.8132 | 0.8152 | 30.7290 | 18.2467 |
| et | Extra Trees Classifier | 0.8995 | 0.9651 | 0.8132 | 0.9277 | 0.8667 | 0.7866 | 0.7910 | 37.7556 | 21.0567 |
| dt | Decision Tree Classifier | 0.8322 | 0.8258 | 0.7930 | 0.7900 | 0.7915 | 0.6511 | 0.6511 | 42.4117 | 8.6367 |
| knn | K Neighbors Classifier | 0.8015 | 0.8670 | 0.7387 | 0.7601 | 0.7493 | 0.5850 | 0.5852 | 53.3970 | 14.6433 |
| nb | Naive Bayes | 0.6866 | 0.7358 | 0.5874 | 0.6149 | 0.6008 | 0.3431 | 0.3433 | 84.3262 | 1.0867 |
| lr | Logistic Regression | 0.7031 | 0.7597 | 0.5185 | 0.6679 | 0.5838 | 0.3586 | 0.3656 | 97.7202 | 10.6267 |
| lda | Linear Discriminant Analysis | 0.7023 | 0.7595 | 0.5104 | 0.6697 | 0.5793 | 0.3553 | 0.3632 | 99.3280 | 1.8333 |
| ridge | Ridge Classifier | 0.7018 | 0.0000 | 0.5057 | 0.6707 | 0.5766 | 0.3534 | 0.3618 | 100.2466 | 1.1133 |
| svm | SVM - Linear Kernel | 0.6003 | 0.0000 | 0.5088 | 0.5025 | 0.5055 | 0.1701 | 0.1702 | 100.6622 | 8.2833 |
| qda | Quadratic Discriminant Analysis | 0.5032 | 0.5022 | 0.4972 | 0.4041 | 0.4285 | 0.0043 | 0.0047 | 103.9145 | 1.3067 |
print(f"Training Time: {end-start}s")
Training Time: 333.8759262561798s
# # pull the score grid of compare_models as a dataframe
# pull()
print(best_model)
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=0,
importance_type='gain', interaction_constraints='',
learning_rate=0.300000012, max_delta_step=0, max_depth=6,
min_child_weight=1, missing=nan,
monotone_constraints='(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)',
n_estimators=100, n_jobs=-1, num_parallel_tree=1,
objective='binary:logistic', random_state=42, reg_alpha=0,
reg_lambda=1, scale_pos_weight=1, subsample=1,
tree_method='gpu_hist', validate_parameters=1, verbosity=0)
plot_model(best_model, plot='feature')
plot_model(best_model, plot = 'confusion_matrix')
# SHAPLEY Explanation:
# https://www.analyticsvidhya.com/blog/2019/11/shapley-value-machine-learning-interpretability-game-theory/
# This plot shows the SHAP values on the x-axis.
# Points on the left represent observations that shift the predicted value in the negative direction,
# while points on the right shift the prediction in the positive direction.
# All the features are listed on the y-axis.
interpret_model(best_model, plot = 'summary')
# https://towardsdatascience.com/explain-your-model-with-the-shap-values-bc36aac4de3d
# This plot is made of all the dots in the train data. It demonstrates the following information:
# Feature importance: Variables are ranked in descending order.
# Impact: The horizontal location shows whether the effect of that value is associated with a higher or lower prediction.
# Original value: Color shows whether that variable is high (in red) or low (in blue) for that observation.
# Correlation:
# - A high level of "x23" has a large negative impact on 'y'.
# - The "high" comes from the red color, and the "negative" impact is shown on the x-axis.
# - Similarly, we can say "x49" is positively correlated with the target variable.
help(get_config)
Help on function get_config in module pycaret.classification:
get_config(variable: str)
This function retrieves the global variables created when initializing the
``setup`` function. Following variables are accessible:
- X: Transformed dataset (X)
- y: Transformed dataset (y)
- X_train: Transformed train dataset (X)
- X_test: Transformed test/holdout dataset (X)
- y_train: Transformed train dataset (y)
- y_test: Transformed test/holdout dataset (y)
- seed: random state set through session_id
- prep_pipe: Transformation pipeline
- fold_shuffle_param: shuffle parameter used in Kfolds
- n_jobs_param: n_jobs parameter used in model training
- html_param: html_param configured through setup
- create_model_container: results grid storage container
- master_model_container: model storage container
- display_container: results display container
- exp_name_log: Name of experiment
- logging_param: log_experiment param
- log_plots_param: log_plots param
- USI: Unique session ID parameter
- fix_imbalance_param: fix_imbalance param
- fix_imbalance_method_param: fix_imbalance_method param
- data_before_preprocess: data before preprocessing
- target_param: name of target variable
- gpu_param: use_gpu param configured through setup
- fold_generator: CV splitter configured in fold_strategy
- fold_param: fold params defined in the setup
- fold_groups_param: fold groups defined in the setup
- stratify_param: stratify parameter defined in the setup
Example
-------
>>> from pycaret.datasets import get_data
>>> juice = get_data('juice')
>>> from pycaret.classification import *
>>> exp_name = setup(data = juice, target = 'Purchase')
>>> X_train = get_config('X_train')
Returns:
Global variable
X_train_transformed = get_config('X_train')
# X_train_transformed.info()
# https://towardsdatascience.com/explain-your-model-with-the-shap-values-bc36aac4de3d
# This differs slightly from the plot above, which probably uses a different method for feature importance
shap_values = shap.TreeExplainer(best_model).shap_values(X_train_transformed)
shap.summary_plot(shap_values, X_train_transformed, plot_type="bar")
# Same as above, but since the SHAP values are (most likely) calculated using a different method,
# the results are slightly different
shap.summary_plot(shap_values, X_train_transformed)
# Simplified Version of the above plot
def ABS_SHAP(df_shap, df, figsize=(5, 6)):
    # Make a copy of the input data
    shap_v = pd.DataFrame(df_shap)
    feature_list = df.columns
    shap_v.columns = feature_list
    df_v = df.copy().reset_index().drop('index', axis=1)

    # Determine the correlation in order to plot with different colors
    corr_list = list()
    for i in feature_list:
        b = np.corrcoef(shap_v[i], df_v[i])[1][0]
        corr_list.append(b)
    corr_df = pd.concat([pd.Series(feature_list), pd.Series(corr_list)], axis=1).fillna(0)
    # Make a data frame. Column 1 is the feature, and Column 2 is the correlation coefficient
    corr_df.columns = ['Variable', 'Corr']
    corr_df['Sign'] = np.where(corr_df['Corr'] > 0, 'red', 'blue')

    # Plot it
    shap_abs = np.abs(shap_v)
    k = pd.DataFrame(shap_abs.mean()).reset_index()
    k.columns = ['Variable', 'SHAP_abs']
    k2 = k.merge(corr_df, left_on='Variable', right_on='Variable', how='inner')
    k2 = k2.sort_values(by='SHAP_abs', ascending=True)
    colorlist = k2['Sign']
    ax = k2.plot.barh(x='Variable', y='SHAP_abs', color=colorlist, figsize=figsize, legend=False)
    ax.set_xlabel("SHAP Value (Red = Positive Impact)")
ABS_SHAP(shap_values,X_train_transformed, figsize=(10,10))
# https://towardsdatascience.com/explain-your-model-with-the-shap-values-bc36aac4de3d
# Shows the negative relationship of 'x23' with the output
# Also shows that 'x23' interacts most strongly with x37 (the color axis)
shap.dependence_plot("x23", shap_values, X_train_transformed)
shap.dependence_plot("x37", shap_values, X_train_transformed)
shap.dependence_plot("x49", shap_values, X_train_transformed)
shap.dependence_plot("x27", shap_values, X_train_transformed)
# This is interesting. It suggests that x27 has a mixed relationship with the output:
# when x42 is high (red), x27 is negatively related to 'y' (y-axis vs x-axis), and
# when x42 is low (blue), x27 is positively related to 'y' (y-axis vs x-axis).
# It is important to note that "Correlation is not the same as Causation".
X_test_transformed = get_config('X_test')
# X_test_transformed.info()
# predict_model(best_model, data=X_test_transformed)
# Get the predictions and put them with the test data.
X_output = X_test_transformed.copy()
# X_output.loc[:,'predict'] = np.round(best_model.predict(X_output),2)
# Randomly pick some observations
random_picks = np.arange(1,330,50) # Every 50 rows
S = X_output.iloc[random_picks]
S
|   | x0 | x1 | x3 | x4 | x5 | x7 | x8 | x9 | x10 | x11 | ... | x29_Mar | x29_May | x29_Nov | x29_Oct | x29_sept. | x30_friday | x30_monday | x30_thurday | x30_tuesday | x30_wednesday |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 65994 | -0.089026 | 4.739374 | -7.616577 | -2.017990 | 10.699079 | -6.058542 | -10.889321 | -2.129038 | 6.684869 | 3.739051 | ... | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 101634 | -0.372234 | -1.540867 | -2.838008 | 1.262614 | -13.563667 | -33.387394 | -7.553047 | -3.021774 | 5.033779 | 4.916846 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| 108956 | 0.320491 | 0.354088 | -14.158455 | -17.671486 | -3.692270 | -4.049163 | 13.038754 | 5.734336 | -13.927984 | 2.710987 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| 78612 | 0.354782 | 4.481230 | -15.937026 | -2.018381 | -11.890825 | -73.281990 | -0.053412 | -0.848773 | 12.343312 | -6.782266 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| 57291 | 0.739714 | -3.030419 | 6.457521 | -9.965528 | -6.408630 | 46.128246 | -9.660893 | -7.095620 | -1.111253 | -7.256639 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| 20386 | -0.185871 | 11.293899 | 5.748356 | 9.490390 | 3.082972 | -40.673294 | 16.613768 | 6.402008 | 2.565773 | -5.854484 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| 105607 | 0.028082 | 2.904887 | 6.538821 | 4.733573 | 1.664429 | -20.943357 | -9.667617 | 3.724478 | 13.791682 | -0.965256 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
7 rows × 64 columns
# https://towardsdatascience.com/explain-your-model-with-the-shap-values-bc36aac4de3d
# Initialize your Jupyter notebook with initjs(), otherwise you will get an error message.
shap.initjs()
# Wrap it in a function
def shap_plot(j):
    explainerModel = shap.TreeExplainer(best_model)
    shap_values_Model = explainerModel.shap_values(S)
    p = shap.force_plot(explainerModel.expected_value, shap_values_Model[j], S.iloc[[j]])
    return p
# Let me walk you through the above code step by step.
# The above shap.force_plot() takes three values:
# - the base value (explainerModel.expected_value),
# - the SHAP values for observation j (shap_values_Model[j]), and
# - the matrix of feature values (S.iloc[[j]]).
# The base value (or expected value) is the average of the model output over the training data X_train.
# It is the base value used in the following plot.
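As a consistency check (a minimal sketch; for a binary XGBoost classifier, TreeExplainer typically returns SHAP values in log-odds space), the base value plus the sum of an observation's SHAP values should reproduce the model's raw margin:
# SHAP additivity check (sketch): base value + sum of SHAP values ~= log-odds output
explainer = shap.TreeExplainer(best_model)
sv = explainer.shap_values(S)
p = best_model.predict_proba(S)[:, 1]
margin = np.log(p / (1 - p))  # convert predicted probability back to log-odds
print(np.allclose(explainer.expected_value + sv.sum(axis=1), margin, atol=1e-3))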
shap_plot(0)
Let me describe this elegant plot in great detail:
x23 is negatively correlated with y. mean(x23) = 0.731136, and in this observation x23 = -27.89. Since this is below the mean and the feature is negatively correlated, it ends up pushing the final prediction higher.

x7 is also negatively correlated with y. mean(x7) = -7.74, and in this observation x7 = -6.059, which is above average, hence it pushes the output lower.

You may wonder how we know the average values of the predictors. Remember that the SHAP model is built on the training data set; the means of the variables are given by X_train.mean().
y_test = get_config('y_test')
y_test.mean()
0.4016099120072604
# Mean Train Values used in explaining above
pd.DataFrame(X_train_transformed.mean()).T
|   | x0 | x1 | x3 | x4 | x5 | x7 | x8 | x9 | x10 | x11 | ... | x29_Mar | x29_May | x29_Nov | x29_Oct | x29_sept. | x30_friday | x30_monday | x30_thurday | x30_tuesday | x30_wednesday |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -0.000279 | 0.008154 | -0.030629 | -0.02018 | 0.001805 | -7.741145 | -0.018226 | 0.017265 | -0.005323 | 0.042266 | ... | 0.007586 | 0.136242 | 0.002121 | 0.014985 | 0.067406 | 0.003522 | 0.003137 | 0.18378 | 0.174488 | 0.635073 |
1 rows × 64 columns
final_model_baseline = finalize_model(best_model)
unseen_predictions = predict_model(final_model_baseline, data=test)
unseen_predictions.head()
|   | x0 | x1 | x3 | x4 | x5 | x7 | x8 | x9 | x10 | x11 | ... | x43 | x44 | x45 | x46 | x47 | x48 | x49 | y | Label | Score |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -0.226706 | 11.350364 | 5.182092 | -2.236454 | 10.716248 | -15.900329 | -0.178002 | 10.901306 | -0.090170 | -6.062488 | ... | -2.064860 | 0.923879 | 0.331452 | 19.172365 | 5.752749 | -2.609553 | -20.320179 | 0 | 0 | 0.9975 |
| 1 | -0.333224 | 1.138614 | 0.104432 | 0.819080 | 6.936425 | 37.988706 | 4.517952 | 17.467962 | 6.692915 | -21.920142 | ... | 1.938380 | -5.395413 | 0.459957 | -77.491333 | 0.754309 | -0.442017 | -21.824215 | 1 | 1 | 0.7324 |
| 2 | -0.479265 | -1.085311 | 5.535042 | -5.157181 | -1.608268 | -33.507600 | 26.144811 | -1.866048 | 3.351425 | -3.272850 | ... | -0.984782 | 1.954579 | 0.383549 | -1.467533 | 13.830803 | -2.831817 | 9.343167 | 0 | 0 | 0.9914 |
| 3 | 0.140972 | -13.770815 | 1.282171 | -10.314443 | -2.140678 | -32.697867 | -12.569104 | -0.510527 | -0.622231 | -9.892130 | ... | 1.088224 | -2.185282 | -0.230979 | 21.335008 | -1.517562 | -0.445338 | 9.285682 | 0 | 0 | 0.8182 |
| 4 | 0.466752 | 10.563190 | 2.027941 | 2.226414 | 17.927492 | 34.170205 | -0.316748 | 8.300755 | 21.643071 | 9.464095 | ... | 0.065431 | 1.069949 | 0.827266 | -0.006643 | -7.881300 | -1.019437 | 7.875589 | 0 | 1 | 0.5544 |
5 rows × 51 columns
# from pycaret.utils import check_metric
# check_metric(unseen_predictions['y'], unseen_predictions['Label'], metric = 'fp10_fn500')
np.round(fp10_fn500_func(y_test=unseen_predictions['y'], y_pred=unseen_predictions['Label']), 2)
17.0
xgboost = create_model('xgboost')
|   | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | fp10_fn500 |
|---|---|---|---|---|---|---|---|---|
| 0 | 0.9237 | 0.9753 | 0.8919 | 0.9159 | 0.9037 | 0.8405 | 0.8407 | 22.0360 |
| 1 | 0.9192 | 0.9733 | 0.8847 | 0.9115 | 0.8979 | 0.8311 | 0.8313 | 23.4880 |
| 2 | 0.9210 | 0.9735 | 0.8895 | 0.9117 | 0.9005 | 0.8351 | 0.8352 | 22.5271 |
| Mean | 0.9213 | 0.9740 | 0.8887 | 0.9130 | 0.9007 | 0.8356 | 0.8358 | 22.6837 |
| SD | 0.0018 | 0.0009 | 0.0030 | 0.0020 | 0.0024 | 0.0039 | 0.0038 | 0.6030 |
# ?tune_model
?np.random.random
np.random.seed(42)
params = {
    "min_child_weight": np.random.randint(5, 10, 10),
    "max_depth": np.ceil(np.random.random(10) * 20).astype(int).tolist(),
    "n_estimators": np.ceil(np.random.random(10) * 200).astype(int).tolist()
}
params
{'min_child_weight': array([8, 9, 7, 9, 9, 6, 7, 7, 7, 9]),
'max_depth': [13, 15, 1, 20, 17, 5, 4, 4, 7, 11],
'n_estimators': [87, 59, 123, 28, 59, 74, 92, 158, 40, 103]}
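For reference, pycaret 2.x's tune_model with the default search_library='scikit-learn' performs a randomized search over this grid; a rough sklearn analogue (a sketch of the idea, not pycaret's exact internals) would be:
# Rough sklearn analogue of tune_model(xgboost, custom_grid=params) (sketch)
from sklearn.model_selection import RandomizedSearchCV
search = RandomizedSearchCV(
    estimator=xgboost,           # the XGBClassifier from create_model above
    param_distributions=params,  # the random grid defined above
    n_iter=10,                   # pycaret's default number of candidates
    cv=3,                        # matches fold=3 from setup()
    random_state=42,
)
# search.fit(get_config('X_train'), get_config('y_train'))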
# https://pycaret.org/tune-model/
# tuned_xgboost = tune_model(xgboost) # Severely Overfits on completely unseen dataset
# tuned_xgboost = tune_model(xgboost, search_library='scikit-optimize') # Severely Overfits on completely unseen dataset
# tune hyperparameters with custom_grid to reduce overfitting
# https://xgboost.readthedocs.io/en/latest/parameter.html#parameters-for-tree-booster
start = time()
tuned_xgboost = tune_model(xgboost, custom_grid=params)
end = time()
|   | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | fp10_fn500 |
|---|---|---|---|---|---|---|---|---|
| 0 | 0.9372 | 0.9805 | 0.9103 | 0.9318 | 0.9209 | 0.8689 | 0.8691 | 18.2753 |
| 1 | 0.9365 | 0.9801 | 0.9091 | 0.9312 | 0.9201 | 0.8675 | 0.8676 | 18.5146 |
| 2 | 0.9330 | 0.9793 | 0.9073 | 0.9244 | 0.9158 | 0.8601 | 0.8602 | 18.9130 |
| Mean | 0.9356 | 0.9799 | 0.9089 | 0.9291 | 0.9189 | 0.8655 | 0.8656 | 18.5676 |
| SD | 0.0019 | 0.0005 | 0.0012 | 0.0034 | 0.0023 | 0.0039 | 0.0039 | 0.2630 |
print(f"Training Time: {end-start}s")
Training Time: 334.8611195087433s
xgboost
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=0,
importance_type='gain', interaction_constraints='',
learning_rate=0.300000012, max_delta_step=0, max_depth=6,
min_child_weight=1, missing=nan,
monotone_constraints='(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)',
n_estimators=100, n_jobs=-1, num_parallel_tree=1,
objective='binary:logistic', random_state=42, reg_alpha=0,
reg_lambda=1, scale_pos_weight=1, subsample=1,
tree_method='gpu_hist', validate_parameters=1, verbosity=0)
tuned_xgboost
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=0,
importance_type='gain', interaction_constraints='',
learning_rate=0.300000012, max_delta_step=0, max_depth=15,
min_child_weight=8, missing=nan,
monotone_constraints='(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)',
n_estimators=123, n_jobs=-1, num_parallel_tree=1,
objective='binary:logistic', random_state=42, reg_alpha=0,
reg_lambda=1, scale_pos_weight=1, subsample=1,
tree_method='gpu_hist', validate_parameters=1, verbosity=0)
plot_model(tuned_xgboost, plot='feature')
plot_model(tuned_xgboost, plot = 'confusion_matrix')
final_model_tuned = finalize_model(tuned_xgboost)
unseen_predictions = predict_model(final_model_tuned, data=test)
unseen_predictions.head()
| x0 | x1 | x3 | x4 | x5 | x7 | x8 | x9 | x10 | x11 | ... | x43 | x44 | x45 | x46 | x47 | x48 | x49 | y | Label | Score | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -0.226706 | 11.350364 | 5.182092 | -2.236454 | 10.716248 | -15.900329 | -0.178002 | 10.901306 | -0.090170 | -6.062488 | ... | -2.064860 | 0.923879 | 0.331452 | 19.172365 | 5.752749 | -2.609553 | -20.320179 | 0 | 0 | 0.9997 |
| 1 | -0.333224 | 1.138614 | 0.104432 | 0.819080 | 6.936425 | 37.988706 | 4.517952 | 17.467962 | 6.692915 | -21.920142 | ... | 1.938380 | -5.395413 | 0.459957 | -77.491333 | 0.754309 | -0.442017 | -21.824215 | 1 | 1 | 0.9531 |
| 2 | -0.479265 | -1.085311 | 5.535042 | -5.157181 | -1.608268 | -33.507600 | 26.144811 | -1.866048 | 3.351425 | -3.272850 | ... | -0.984782 | 1.954579 | 0.383549 | -1.467533 | 13.830803 | -2.831817 | 9.343167 | 0 | 0 | 0.9989 |
| 3 | 0.140972 | -13.770815 | 1.282171 | -10.314443 | -2.140678 | -32.697867 | -12.569104 | -0.510527 | -0.622231 | -9.892130 | ... | 1.088224 | -2.185282 | -0.230979 | 21.335008 | -1.517562 | -0.445338 | 9.285682 | 0 | 0 | 0.9429 |
| 4 | 0.466752 | 10.563190 | 2.027941 | 2.226414 | 17.927492 | 34.170205 | -0.316748 | 8.300755 | 21.643071 | 9.464095 | ... | 0.065431 | 1.069949 | 0.827266 | -0.006643 | -7.881300 | -1.019437 | 7.875589 | 0 | 1 | 0.9081 |
5 rows × 51 columns
# Confusion Matrix on completely unseen data
from sklearn.metrics import confusion_matrix
confusion_matrix(y_true=unseen_predictions['y'], y_pred=unseen_predictions['Label'])
array([[18196,   825],
       [  997, 11660]], dtype=int64)
np.round(fp10_fn500_func(y_test=unseen_predictions['y'], y_pred=unseen_predictions['Label']), 2)
16.0
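The same number can be derived directly from the confusion matrix above (a sketch, using sklearn's [[tn, fp], [fn, tp]] layout):
# Custom cost straight from the confusion matrix (sketch)
tn, fp, fn, tp = confusion_matrix(
    y_true=unseen_predictions['y'], y_pred=unseen_predictions['Label']).ravel()
print(np.round((10 * fp + 500 * fn) / (tn + fp + fn + tp), 2))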
# SHAPLEY Explanation for the tuned model (see the notes on the summary plot earlier in this notebook)
interpret_model(final_model_tuned, plot = 'summary')
catboost = create_model('catboost')
|   | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | fp10_fn500 |
|---|---|---|---|---|---|---|---|---|
| 0 | 0.9280 | 0.9768 | 0.8968 | 0.9218 | 0.9091 | 0.8495 | 0.8497 | 21.0213 |
| 1 | 0.9250 | 0.9758 | 0.8931 | 0.9180 | 0.9053 | 0.8433 | 0.8435 | 21.7914 |
| 2 | 0.9227 | 0.9748 | 0.8919 | 0.9136 | 0.9026 | 0.8386 | 0.8388 | 22.0462 |
| Mean | 0.9252 | 0.9758 | 0.8939 | 0.9178 | 0.9057 | 0.8438 | 0.8440 | 21.6196 |
| SD | 0.0022 | 0.0008 | 0.0021 | 0.0033 | 0.0027 | 0.0045 | 0.0045 | 0.4357 |
# https://pycaret.org/tune-model/
# https://github.com/pycaret/pycaret/releases
# tuned_catboost = tune_model(catboost, search_library='scikit-optimize') # Does not work
# Fine-tuning may need a custom grid, as in the case of XGBoost above
tuned_catboost = tune_model(catboost)
|   | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | fp10_fn500 |
|---|---|---|---|---|---|---|---|---|
| 0 | 0.9032 | 0.9649 | 0.8468 | 0.9061 | 0.8754 | 0.7965 | 0.7977 | 31.1151 |
| 1 | 0.9037 | 0.9646 | 0.8485 | 0.9057 | 0.8762 | 0.7975 | 0.7986 | 30.7780 |
| 2 | 0.9008 | 0.9627 | 0.8464 | 0.9005 | 0.8726 | 0.7915 | 0.7925 | 31.2131 |
| Mean | 0.9026 | 0.9641 | 0.8472 | 0.9041 | 0.8747 | 0.7951 | 0.7963 | 31.0354 |
| SD | 0.0013 | 0.0009 | 0.0009 | 0.0025 | 0.0015 | 0.0026 | 0.0027 | 0.1863 |
catboost.get_all_params()
{'nan_mode': 'Min',
'gpu_ram_part': 0.95,
'eval_metric': 'Logloss',
'iterations': 1000,
'leaf_estimation_method': 'Newton',
'observations_to_bootstrap': 'TestOnly',
'grow_policy': 'SymmetricTree',
'boosting_type': 'Plain',
'feature_border_type': 'GreedyLogSum',
'bayesian_matrix_reg': 0.10000000149011612,
'devices': '-1',
'pinned_memory_bytes': '104857600',
'l2_leaf_reg': 3,
'random_strength': 1,
'rsm': 1,
'boost_from_average': False,
'gpu_cat_features_storage': 'GpuRam',
'fold_size_loss_normalization': False,
'model_size_reg': 0.5,
'use_best_model': False,
'class_names': [0, 1],
'random_seed': 42,
'depth': 6,
'border_count': 32,
'min_fold_size': 100,
'data_partition': 'DocParallel',
'bagging_temperature': 1,
'classes_count': 0,
'auto_class_weights': 'None',
'leaf_estimation_backtracking': 'AnyImprovement',
'best_model_min_trees': 1,
'min_data_in_leaf': 1,
'add_ridge_penalty_to_loss_function': False,
'loss_function': 'Logloss',
'learning_rate': 0.026148999109864235,
'score_function': 'Cosine',
'task_type': 'GPU',
'leaf_estimation_iterations': 10,
'bootstrap_type': 'Bayesian',
'max_leaves': 64}
tuned_catboost.get_all_params()
{'nan_mode': 'Min',
'gpu_ram_part': 0.95,
'eval_metric': 'Logloss',
'iterations': 120,
'leaf_estimation_method': 'Newton',
'observations_to_bootstrap': 'TestOnly',
'grow_policy': 'SymmetricTree',
'boosting_type': 'Plain',
'feature_border_type': 'GreedyLogSum',
'bayesian_matrix_reg': 0.10000000149011612,
'devices': '-1',
'pinned_memory_bytes': '104857600',
'l2_leaf_reg': 6,
'random_strength': 0.30000001192092896,
'rsm': 1,
'boost_from_average': False,
'gpu_cat_features_storage': 'GpuRam',
'fold_size_loss_normalization': False,
'model_size_reg': 0.5,
'use_best_model': False,
'class_names': [0, 1],
'random_seed': 42,
'depth': 8,
'border_count': 32,
'min_fold_size': 100,
'data_partition': 'DocParallel',
'bagging_temperature': 1,
'classes_count': 0,
'auto_class_weights': 'None',
'leaf_estimation_backtracking': 'AnyImprovement',
'best_model_min_trees': 1,
'min_data_in_leaf': 1,
'add_ridge_penalty_to_loss_function': False,
'loss_function': 'Logloss',
'learning_rate': 0.029999999329447743,
'score_function': 'Cosine',
'task_type': 'GPU',
'leaf_estimation_iterations': 10,
'bootstrap_type': 'Bayesian',
'max_leaves': 256}
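Rather than eyeballing the two parameter dumps, a small diff (sketch) isolates what the tuner actually changed (iterations, l2_leaf_reg, random_strength, depth, learning_rate, max_leaves):
# Diff the default vs tuned CatBoost parameters (sketch)
base_params = catboost.get_all_params()
tuned_params = tuned_catboost.get_all_params()
for key in sorted(base_params):
    if base_params[key] != tuned_params.get(key):
        print(f"{key}: {base_params[key]} -> {tuned_params.get(key)}")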
plot_model(tuned_catboost, plot = 'confusion_matrix')
final_model_tuned = finalize_model(tuned_catboost)
unseen_predictions = predict_model(final_model_tuned, data=test)
unseen_predictions.head()
|   | x0 | x1 | x3 | x4 | x5 | x7 | x8 | x9 | x10 | x11 | ... | x43 | x44 | x45 | x46 | x47 | x48 | x49 | y | Label | Score |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -0.226706 | 11.350364 | 5.182092 | -2.236454 | 10.716248 | -15.900329 | -0.178002 | 10.901306 | -0.090170 | -6.062488 | ... | -2.064860 | 0.923879 | 0.331452 | 19.172365 | 5.752749 | -2.609553 | -20.320179 | 0 | 0 | 0.9835 |
| 1 | -0.333224 | 1.138614 | 0.104432 | 0.819080 | 6.936425 | 37.988706 | 4.517952 | 17.467962 | 6.692915 | -21.920142 | ... | 1.938380 | -5.395413 | 0.459957 | -77.491333 | 0.754309 | -0.442017 | -21.824215 | 1 | 1 | 0.5005 |
| 2 | -0.479265 | -1.085311 | 5.535042 | -5.157181 | -1.608268 | -33.507600 | 26.144811 | -1.866048 | 3.351425 | -3.272850 | ... | -0.984782 | 1.954579 | 0.383549 | -1.467533 | 13.830803 | -2.831817 | 9.343167 | 0 | 0 | 0.9371 |
| 3 | 0.140972 | -13.770815 | 1.282171 | -10.314443 | -2.140678 | -32.697867 | -12.569104 | -0.510527 | -0.622231 | -9.892130 | ... | 1.088224 | -2.185282 | -0.230979 | 21.335008 | -1.517562 | -0.445338 | 9.285682 | 0 | 1 | 0.5257 |
| 4 | 0.466752 | 10.563190 | 2.027941 | 2.226414 | 17.927492 | 34.170205 | -0.316748 | 8.300755 | 21.643071 | 9.464095 | ... | 0.065431 | 1.069949 | 0.827266 | -0.006643 | -7.881300 | -1.019437 | 7.875589 | 0 | 1 | 0.5803 |
5 rows × 51 columns
np.round(fp10_fn500_func(y_test=unseen_predictions['y'], y_pred=unseen_predictions['Label']), 2)
18.05
# SHAPLEY Explanation for the tuned CatBoost model (see the notes on the summary plot earlier in this notebook)
interpret_model(final_model_tuned, plot = 'summary')
logs = get_logs()
logs
|   | run_id | experiment_id | status | artifact_uri | start_time | end_time | metrics.Accuracy | metrics.MCC | metrics.Prec | metrics.TT | ... | tags.Source | tags.mlflow.source.name | tags.USI | tags.URI | tags.Run Time | tags.mlflow.runName | tags.mlflow.user | tags.Run ID | tags.mlflow.source.type | tags.mlflow.log-model.history |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | c7f25d07334b45aeab7f5bf8386acdf7 | 1 | FINISHED | file:///C:/Users/Nikhil/OneDrive%20-%20Souther... | 2020-11-27 09:54:35.017000+00:00 | 2020-11-27 09:54:38.644000+00:00 | 0.902300 | 0.795700 | 0.903800 | 3.8100 | ... | finalize_model | C:\Users\Nikhil\.conda\envs\ds7337_cs3\lib\sit... | 1249 | 34798336 | 19.64 | CatBoost Classifier | Nikhil | c7f25d07334b45aeab7f5bf8386acdf7 | LOCAL | [{"run_id": "c7f25d07334b45aeab7f5bf8386acdf7"... |
| 1 | f708e996411c4f06967eea57811f4306 | 1 | FINISHED | file:///C:/Users/Nikhil/OneDrive%20-%20Souther... | 2020-11-27 09:54:06.420000+00:00 | 2020-11-27 09:54:10.119000+00:00 | 0.902600 | 0.796300 | 0.904100 | 3.5800 | ... | tune_model | C:\Users\Nikhil\.conda\envs\ds7337_cs3\lib\sit... | 1249 | 36e9048a | 114.97 | CatBoost Classifier | Nikhil | f708e996411c4f06967eea57811f4306 | LOCAL | [{"run_id": "f708e996411c4f06967eea57811f4306"... |
| 2 | 2e90d0eb4da64210b53c1e134460fc74 | 1 | FINISHED | file:///C:/Users/Nikhil/OneDrive%20-%20Souther... | 2020-11-27 09:52:05.215000+00:00 | 2020-11-27 09:52:08.400000+00:00 | 0.925245 | 0.844001 | 0.917791 | 16.2800 | ... | create_model | C:\Users\Nikhil\.conda\envs\ds7337_cs3\lib\sit... | 1249 | 58d7152b | 68.81 | CatBoost Classifier | Nikhil | 2e90d0eb4da64210b53c1e134460fc74 | LOCAL | [{"run_id": "2e90d0eb4da64210b53c1e134460fc74"... |
| 3 | 06bfdb2641fe48888d85507b88c5e292 | 1 | FINISHED | file:///C:/Users/Nikhil/OneDrive%20-%20Souther... | 2020-11-27 09:47:26.565000+00:00 | 2020-11-27 09:47:31.059000+00:00 | 0.938300 | 0.871300 | 0.931800 | 20.8100 | ... | finalize_model | C:\Users\Nikhil\.conda\envs\ds7337_cs3\lib\sit... | 1249 | 9128cc46 | 82.71 | Extreme Gradient Boosting | Nikhil | 06bfdb2641fe48888d85507b88c5e292 | LOCAL | [{"run_id": "06bfdb2641fe48888d85507b88c5e292"... |
| 4 | 61e4821d99be4ebc8cf01fc3a82fef6f | 1 | FINISHED | file:///C:/Users/Nikhil/OneDrive%20-%20Souther... | 2020-11-27 09:45:51.582000+00:00 | 2020-11-27 09:45:55.420000+00:00 | 0.935600 | 0.865600 | 0.929100 | 17.1100 | ... | tune_model | C:\Users\Nikhil\.conda\envs\ds7337_cs3\lib\sit... | 1249 | ab7826f6 | 328.31 | Extreme Gradient Boosting | Nikhil | 61e4821d99be4ebc8cf01fc3a82fef6f | LOCAL | [{"run_id": "61e4821d99be4ebc8cf01fc3a82fef6f"... |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 117 | 63c6c2b4bb164e28b9e61940b6c67c2e | 1 | FINISHED | file:///C:/Users/Nikhil/OneDrive%20-%20Souther... | 2020-11-24 20:58:14.181000+00:00 | 2020-11-24 20:58:14.962000+00:00 | 0.921300 | 0.835800 | 0.913000 | 4.4467 | ... | compare_models | C:\Users\Nikhil\.conda\envs\ds7337_cs3\lib\sit... | 9201 | b4786d58 | 13.53 | Extreme Gradient Boosting | Nikhil | 63c6c2b4bb164e28b9e61940b6c67c2e | LOCAL | [{"run_id": "63c6c2b4bb164e28b9e61940b6c67c2e"... |
| 118 | 68781a28ebe3482faac34aa489e750e6 | 1 | FINISHED | file:///C:/Users/Nikhil/OneDrive%20-%20Souther... | 2020-11-24 20:58:12.732000+00:00 | 2020-11-24 20:58:14.084000+00:00 | 0.925600 | 0.844600 | 0.918300 | 15.8967 | ... | compare_models | C:\Users\Nikhil\.conda\envs\ds7337_cs3\lib\sit... | 9201 | b4786d58 | 47.84 | CatBoost Classifier | Nikhil | 68781a28ebe3482faac34aa489e750e6 | LOCAL | [{"run_id": "68781a28ebe3482faac34aa489e750e6"... |
| 119 | e4e16e8c6de6422988bddfced3757713 | 1 | FINISHED | file:///C:/Users/Nikhil/OneDrive%20-%20Souther... | 2020-11-24 20:44:41.330000+00:00 | 2020-11-24 20:44:41.634000+00:00 | NaN | NaN | NaN | NaN | ... | setup | C:\Users\Nikhil\.conda\envs\ds7337_cs3\lib\sit... | 9201 | cdbd73c0 | 27.9 | Session Initialized 9201 | Nikhil | e4e16e8c6de6422988bddfced3757713 | LOCAL | None |
| 120 | f2aadf1f4f5b418e8ebe7b6ac5d3afcc | 1 | FINISHED | file:///C:/Users/Nikhil/OneDrive%20-%20Souther... | 2020-11-24 20:42:09.128000+00:00 | 2020-11-24 20:42:09.464000+00:00 | NaN | NaN | NaN | NaN | ... | setup | C:\Users\Nikhil\.conda\envs\ds7337_cs3\lib\sit... | 306d | 3058dff9 | 12.41 | Session Initialized 306d | Nikhil | f2aadf1f4f5b418e8ebe7b6ac5d3afcc | LOCAL | None |
| 121 | 08442a8b87124634b8482b300848c99e | 1 | FINISHED | file:///C:/Users/Nikhil/OneDrive%20-%20Souther... | 2020-11-24 20:34:23.185000+00:00 | 2020-11-24 20:34:23.574000+00:00 | NaN | NaN | NaN | NaN | ... | setup | C:\Users\Nikhil\.conda\envs\ds7337_cs3\lib\sit... | 2a60 | 519ded23 | 11.81 | Session Initialized 2a60 | Nikhil | 08442a8b87124634b8482b300848c99e | LOCAL | None |
122 rows × 209 columns
logs.iloc[0]['artifact_uri']
'file:///C:/Users/Nikhil/OneDrive%20-%20Southern%20Methodist%20University/SMU%20MSDS/202009/DS7333/ds7333_qtw/case_study_7/analysis/nikhil/mlruns/1/c7f25d07334b45aeab7f5bf8386acdf7/artifacts'